Wittenberg University - Master of Science in Analytics

ANLT 510 - Advanced Statistics and Modeling

Day 1: Feature & Target Engineering

03 Oct 2021

Feature & Target Engineering

Introduction

Prerequisites

# packages ----
library(modeldata)
library(AmesHousing)
library(tidyverse)   # attaches dplyr, ggplot2, etc.
library(dplyr)       # already attached via tidyverse; kept for explicitness
library(ggplot2)     # already attached via tidyverse; kept for explicitness
library(rsample)
library(recipes)
library(forecast)

# ames data ----
# use `<-` for assignment, consistent with the rest of this file
ames <- AmesHousing::make_ames()
# To view a snapshot of this data set run:
### DT::datatable(head(ames, n = 20))

# split data ----
# fixed seed so the train/test partition is reproducible
set.seed(123)
# stratified on the outcome so both partitions preserve its distribution;
# default prop = 0.75 (3/4 train, 1/4 test)
split <- rsample::initial_split(ames, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test  <- rsample::testing(split)

Target Engineering

Normality correction

"Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally."

\[ \texttt{Sale_Price} = \beta_0 + \beta_1\texttt{Year_Built} + \epsilon \]

Transformation options

\[ \begin{equation} y(\lambda) = \begin{cases} \frac{y^\lambda-1}{\lambda}, & \text{if}\ \lambda \neq 0 \\ \log y, & \text{if}\ \lambda = 0. \end{cases} \end{equation} \]

step_log()
step_BoxCox()
step_YeoJohnson()

Missingness

Overview

sum(is.na(AmesHousing::ames_raw))
[1] 13997
# Visualize the missingness pattern: one raster cell per (observation,
# variable) pair, shaded by whether the value is NA
AmesHousing::ames_raw %>%
  is.na() %>%
  # melt the logical matrix to long form: Var1 = row index, Var2 = variable
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill=value)) + 
    geom_raster() + 
    # flip so the many variable names run along the readable axis
    coord_flip() +
    scale_y_continuous(NULL, expand = c(0, 0)) +
    scale_fill_grey(name = "", labels = c("Present", "Missing")) +
    xlab("Observation") +
    theme(axis.text.y  = element_text(size = 4))

Structural vs random

# Structural missingness check: when `Garage Type` is NA, the related
# garage count/area columns are all zero (no garage, not a random gap)
AmesHousing::ames_raw %>%
  select(`Garage Type`, `Garage Cars`, `Garage Area`) %>%
  filter(is.na(`Garage Type`))
# A tibble: 157 x 3
   `Garage Type` `Garage Cars` `Garage Area`
   <chr>                 <int>         <int>
 1 <NA>                      0             0
 2 <NA>                      0             0
 3 <NA>                      0             0
 4 <NA>                      0             0
 5 <NA>                      0             0
 6 <NA>                      0             0
 7 <NA>                      0             0
 8 <NA>                      0             0
 9 <NA>                      0             0
10 <NA>                      0             0
# ... with 147 more rows

Imputation

step_meanimpute()   (renamed step_impute_mean() in recipes >= 0.1.16)
step_medianimpute() (renamed step_impute_median())
step_modeimpute()   (renamed step_impute_mode())
step_knnimpute()    (renamed step_impute_knn())
step_bagimpute()    (renamed step_impute_bag())

Feature Filtering

More is not always better!

Options for filtering

# Compute near-zero-variance diagnostics for every training predictor,
# then keep only the rows flagged as nzv
nzv_metrics <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_metrics %>%
  rownames_to_column() %>%
  filter(nzv)
              rowname  freqRatio percentUnique zeroVar  nzv
1              Street  198.72727    0.09103323   FALSE TRUE
2               Alley   22.70000    0.13654984   FALSE TRUE
3        Land_Contour   20.01020    0.18206645   FALSE TRUE
4           Utilities 1097.00000    0.13654984   FALSE TRUE
5          Land_Slope   20.87000    0.13654984   FALSE TRUE
6         Condition_2  217.50000    0.31861629   FALSE TRUE
7           Roof_Matl  135.50000    0.27309968   FALSE TRUE
8           Bsmt_Cond   23.34524    0.27309968   FALSE TRUE
9      BsmtFin_Type_2   25.06667    0.31861629   FALSE TRUE
10       BsmtFin_SF_2  486.00000    9.46745562   FALSE TRUE
11            Heating   93.82609    0.27309968   FALSE TRUE
12    Low_Qual_Fin_SF  540.50000    1.41101502   FALSE TRUE
13      Kitchen_AbvGr   21.60825    0.18206645   FALSE TRUE
14         Functional   37.75926    0.36413291   FALSE TRUE
15     Enclosed_Porch  124.53333    7.28265817   FALSE TRUE
16 Three_season_porch  724.00000    1.04688211   FALSE TRUE
17       Screen_Porch  182.00000    4.64269458   FALSE TRUE
18          Pool_Area 2186.00000    0.54619936   FALSE TRUE
19            Pool_QC  546.50000    0.22758307   FALSE TRUE
20       Misc_Feature   32.66154    0.27309968   FALSE TRUE
21           Misc_Val  212.60000    1.54756486   FALSE TRUE
step_zv()
step_nzv()
step_corr()

Numeric Feature Engineering

Transformations

step_log()
step_BoxCox()
step_YeoJohnson()
step_center()
step_scale()

Categorical Feature Engineering

One-hot & Dummy encoding

id x
1 c
2 c
3 c
4 b
id xc
1 1
2 1
3 1
4 0

Label encoding

ames_train %>% select(matches("Qual|QC|Qu"))
# A tibble: 2,197 x 9
   Overall_Qual  Exter_Qual Bsmt_Qual   Heating_QC Low_Qual_Fin_SF Kitchen_Qual
   <fct>         <fct>      <fct>       <fct>                <int> <fct>       
 1 Below_Average Typical    Typical     Typical                  0 Typical     
 2 Average       Typical    Typical     Typical                  0 Typical     
 3 Above_Average Typical    Typical     Typical                  0 Typical     
 4 Above_Average Typical    Good        Typical                  0 Typical     
 5 Good          Typical    Good        Excellent                0 Typical     
 6 Above_Average Typical    Good        Typical                  0 Typical     
 7 Average       Typical    Typical     Typical                  0 Typical     
 8 Above_Average Typical    No_Basement Typical                  0 Typical     
 9 Average       Typical    Typical     Typical                  0 Typical     
10 Average       Typical    Typical     Excellent                0 Typical     
# ... with 2,187 more rows, and 3 more variables: Fireplace_Qu <fct>,
#   Garage_Qual <fct>, Pool_QC <fct>
count(ames_train, Overall_Qual)
# A tibble: 10 x 2
   Overall_Qual       n
   <fct>          <int>
 1 Very_Poor          3
 2 Poor               9
 3 Fair              36
 4 Below_Average    166
 5 Average          607
 6 Above_Average    554
 7 Good             455
 8 Very_Good        266
 9 Excellent         77
10 Very_Excellent    24
# Integer-encode the ordered Overall_Qual factor (levels -> 1..10),
# then tabulate to confirm the mapping preserves the level counts
qual_rec <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(Overall_Qual)
qual_rec %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_train) %>%
  count(Overall_Qual)
# A tibble: 10 x 2
   Overall_Qual     n
          <dbl> <int>
 1            1     3
 2            2     9
 3            3    36
 4            4   166
 5            5   607
 6            6   554
 7            7   455
 8            8   266
 9            9    77
10           10    24

Common categorical encodings

We’ll put these pieces together later

step_dummy()
step_dummy(one_hot = TRUE)
step_integer()
step_ordinalscore()

Dimension Reduction

PCA

step_pca()
step_kpca()
step_pls()
step_spatialsign()

Blueprints

Sequential steps

Data leakage

Putting the process together

  • The recipes package (https://recipes.tidymodels.org) provides a convenient way to create feature engineering blueprints

  • 3 main components to consider

    1. recipe: define your pre-processing blueprint
    2. prepare: estimate parameters based on training data
    3. bake/juice: apply blueprint to new data
  • Check out all the available step_xxx() functions at http://bit.ly/step_functions

# Feature engineering blueprint, built up one step at a time:
# drop near-zero-variance nominal predictors, standardize the numeric
# predictors (outcome excluded), then integer-encode the quality columns.
# Step order is preserved exactly — recipes applies steps sequentially.
blueprint <- recipe(Sale_Price ~ ., data = ames_train)
blueprint <- step_nzv(blueprint, all_nominal())
blueprint <- step_center(blueprint, all_numeric(), -all_outcomes())
blueprint <- step_scale(blueprint, all_numeric(), -all_outcomes())
blueprint <- step_integer(blueprint, matches("Qual|Cond|QC|Qu"))

blueprint
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor         80

Operations:

Sparse, unbalanced variable filter on all_nominal()
Centering for all_numeric(), -all_outcomes()
Scaling for all_numeric(), -all_outcomes()
Integer encoding for matches("Qual|Cond|QC|Qu")
# Estimate the blueprint's parameters (nzv filter, centering means,
# scaling sds, integer mappings) from the training data only
prepare <- prep(blueprint, training = ames_train)
prepare
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor         80

Training data contained 2197 data points and no missing data.

Operations:

Sparse, unbalanced variable filter removed Street, Alley, Land_Contour, ... [trained]
Centering for Lot_Frontage, Lot_Area, ... [trained]
Scaling for Lot_Frontage, Lot_Area, ... [trained]
Integer encoding for Condition_1, Overall_Qual, Overall_Cond, ... [trained]
# Apply the trained blueprint to both partitions: the same parameters
# learned from training data are reused for the test set (no leakage)
baked_train <- bake(prepare, new_data = ames_train)
baked_test <- bake(prepare, new_data = ames_test)

# inspect the processed training data
baked_train
# A tibble: 2,197 x 68
   MS_SubClass MS_Zoning Lot_Frontage Lot_Area Lot_Shape Lot_Config Neighborhood
   <fct>       <fct>            <dbl>    <dbl> <fct>     <fct>      <fct>       
 1 One_Story_~ Resident~        0.370  -0.205  Regular   Corner     North_Ames  
 2 Two_Story_~ Resident~       -1.08   -0.984  Regular   Inside     Briardale   
 3 Two_Story_~ Resident~       -1.08   -0.984  Regular   Inside     Briardale   
 4 One_Story_~ Resident~       -0.135  -0.710  Regular   Inside     Northpark_V~
 5 One_Story_~ Resident~       -0.996  -0.914  Regular   FR2        Northpark_V~
 6 One_Story_~ Resident~       -0.224  -0.347  Regular   Inside     Sawyer_West 
 7 One_Story_~ Resident~        0.370  -0.0422 Regular   Corner     Sawyer_West 
 8 Duplex_All~ Resident~        0.310  -0.143  Regular   Inside     Sawyer      
 9 One_Story_~ Resident~       -1.71   -0.0400 Slightly~ Inside     Sawyer      
10 One_Story_~ Resident~       -1.71   -0.379  Slightly~ Corner     Sawyer      
# ... with 2,187 more rows, and 61 more variables: Condition_1 <dbl>,
#   Bldg_Type <fct>, House_Style <fct>, Overall_Qual <dbl>, Overall_Cond <dbl>,
#   Year_Built <dbl>, Year_Remod_Add <dbl>, Roof_Style <fct>,
#   Exterior_1st <fct>, Exterior_2nd <fct>, Mas_Vnr_Type <fct>,
#   Mas_Vnr_Area <dbl>, Exter_Qual <dbl>, Exter_Cond <dbl>, Foundation <fct>,
#   Bsmt_Qual <dbl>, Bsmt_Exposure <fct>, BsmtFin_Type_1 <fct>,
#   BsmtFin_SF_1 <dbl>, BsmtFin_SF_2 <dbl>, Bsmt_Unf_SF <dbl>, ...

Simplifying with caret

  • recipes provides a convenient way to create feature engineering blueprints

  • 3 main components to consider

    1. recipe: define your pre-processing blueprint
    2. prepare: estimate parameters based on training data
    3. bake: apply blueprint to new data
  • Luckily, caret simplifies this process for us.

    1. We supply caret a recipe
    2. caret will prepare & bake within each resample

Putting the process together

# 1. stratified sampling with the rsample package
#    (fixed seed for a reproducible 70/30 split, stratified on the outcome)
set.seed(123)
split  <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train  <- training(split)
ames_test   <- testing(split)
# 2. Feature engineering
#    NOTE: step order matters — here integer encoding runs BEFORE
#    center/scale, so the encoded ordinal scores are standardized too
blueprint <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_nominal()) %>%
  step_integer(matches("Qual|Cond|QC|Qu")) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
# 3. create a resampling method
#    10-fold CV repeated 5 times = 50 resamples per candidate k
cv <- caret::trainControl(
  method = "repeatedcv", 
  number = 10, 
  repeats = 5
  )
# 4. create a hyperparameter grid search
#    candidate neighborhood sizes k = 2..25
hyper_grid <- expand.grid(k = seq(2, 25, by = 1))
# 5. execute grid search with knn model
#    use RMSE as preferred metric
#    caret preps & bakes the recipe inside each resample, so the
#    preprocessing never sees held-out data (avoids data leakage)
knn_fit <- caret::train(
  blueprint, 
  data = ames_train, 
  method = "knn", 
  trControl = cv, 
  tuneGrid = hyper_grid,
  metric = "RMSE"
  )
# 6. evaluate results
# print model results
knn_fit
k-Nearest Neighbors 

2049 samples
  80 predictor

Recipe steps: nzv, integer, center, scale, dummy 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 1844, 1845, 1845, 1843, 1844, 1844, ... 
Resampling results across tuning parameters:

  k   RMSE      Rsquared   MAE     
   2  35240.61  0.8132590  22623.02
   3  34165.90  0.8269893  21795.42
   4  34344.58  0.8274891  21602.65
   5  34275.26  0.8310075  21468.52
   6  34116.37  0.8345686  21233.82
   7  33773.92  0.8393139  20997.91
   8  33442.06  0.8439293  20832.61
   9  33324.23  0.8459464  20730.40
  10  33298.62  0.8478648  20765.17
  11  33434.22  0.8473373  20857.19
  12  33517.24  0.8474209  20924.11
  13  33591.58  0.8473141  21006.68
  14  33706.79  0.8470867  21102.25
  15  33792.83  0.8470461  21150.34
  16  33995.48  0.8457246  21224.70
  17  34139.36  0.8447019  21285.43
  18  34250.49  0.8439190  21346.94
  19  34322.29  0.8439623  21391.36
  20  34445.59  0.8431411  21442.68
  21  34511.36  0.8429353  21482.19
  22  34620.72  0.8422083  21558.53
  23  34712.69  0.8417518  21615.48
  24  34800.93  0.8414322  21670.65
  25  34872.12  0.8411673  21721.18

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was k = 10.
# plot cross validation results: RMSE profile across candidate k values
cv_profile <- knn_fit$results
ggplot(cv_profile, aes(x = k, y = RMSE)) + 
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::dollar)

Putting the process together

Questions?